#!/usr/bin/python
#coding: utf-8

import contextlib
import io
import re
import urllib2

__author__ = u"吴修树"

class TBMM(object):
    """Scraper for the Taobao MM top-list page.

    For every entry matched on a listing page it downloads the entry's
    picture and writes a short text profile (name, age, address and the
    description paragraph) next to it.
    """

    # Capture groups, in order: image URL without its scheme, display name,
    # age, address, description.  Compiled once instead of on every call.
    _ITEM_PATTERN = re.compile(
        r'<div class="pic s60".*?<img src="//(.*?)".*?'
        r'<p class="top".*?<a.*?>(.*?)</a>.*?'
        r'<em>.*?<strong>(.*?)</strong>.*?'
        r'<span>(.*?)</span>.*?'
        r'<p class="description">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        self.siteURL = 'http://mm.taobao.com/json/request_top_list.htm'
        # Browser-like headers so the request is not rejected as a bot.
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
            'Referer': 'http://mm.taobao.com/json/request_top_list.htm',
        }

    @staticmethod
    def _toText(value):
        """Return *value* as text, decoding byte strings as UTF-8."""
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    @staticmethod
    def _safeFileName(name):
        """Replace path separators and NUL in a scraped name with '_'."""
        for unsafe in (u"/", u"\\", u"\x00"):
            name = name.replace(unsafe, u"_")
        return name

    def getPage(self, getPageIndex):
        """Fetch one listing page.

        Args:
            getPageIndex: page number, appended as the ``page`` query
                parameter.

        Returns:
            The page body as a UTF-8 encoded byte string (the response is
            decoded as GBK and re-encoded).
        """
        url = self.siteURL + "?page=" + str(getPageIndex)
        request = urllib2.Request(url, headers=self.headers)
        # urllib2 responses are not context managers on Python 2, so wrap
        # them in closing() to release the connection deterministically.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            raw = response.read()
        return raw.decode("gbk").encode("utf-8")

    def getContents(self, getPageIndex):
        """Download the picture and profile of every entry on a page.

        Files are named after the entry's display name (``<name>.jpg`` and
        ``<name>.txt``) and opened with relative paths.

        Args:
            getPageIndex: page number passed on to ``getPage``.
        """
        page = self.getPage(getPageIndex)
        for imagePath, name, age, address, brief in self._ITEM_PATTERN.findall(page):
            # The name comes from remote HTML: neutralise path separators
            # before using it as a file name so it cannot leave the
            # intended directory or make open() fail.
            baseName = self._safeFileName(name.decode("utf-8"))
            self.saveImg("https://" + imagePath, baseName + ".jpg")
            self.saveBrief(baseName + ".txt", name, age, address, brief.strip())

    def saveImg(self, imageURL, fileName):
        """Download *imageURL* and store the raw bytes in *fileName*."""
        with contextlib.closing(urllib2.urlopen(imageURL)) as picture:
            data = picture.read()
        with open(fileName, "wb") as f:
            f.write(data)

    def saveBrief(self, fileName, name, age, address, brief):
        """Write the profile as four labelled lines, encoded as UTF-8.

        Each value may be a text string or a UTF-8 encoded byte string;
        both produce the same file contents.

        Args:
            fileName: path of the text file to create or overwrite.
            name: display name.
            age: age as a string.
            address: address.
            brief: description text.
        """
        fields = (
            (u"姓名：", name),
            (u"年龄：", age),
            (u"住址：", address),
            (u"代言产品：", brief),
        )
        # Explicit encoding plus text normalisation avoids the implicit
        # ASCII coercion that mixing byte and unicode strings triggers.
        with io.open(fileName, "w", encoding="utf-8") as f:
            for label, value in fields:
                f.write(label + self._toText(value) + u"\n")

if __name__ == "__main__":
    # Script entry point: scrape page 1 of the listing, saving each entry's
    # picture and profile text under relative file names.
    scraper = TBMM()
    scraper.getContents(1)